// Copyright 2024 The Google Research Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Information about choosing an aptamer for validation.

syntax = "proto3";

package xxx.aptamers;

// The choice of an aptamer for validation and the reason it was chosen. A
// sequence could be a positive control, or a mutation created from a known
// sequence, or both, etc.
message Choice {
  // The sequence (only the random area between the primers) is used here
  // as a unique id for the aptamer. The full aptamer details are stored
  // in an Aptamer proto.
  string aptamer_sequence = 1;

  // The cluster number for this aptamer.
  int64 cluster_num = 2;

  // Whether this sequence is in the test or training folds.
  // Clarification: This is the source of this sequence in this particular
  // reason. A given sequence can be in the test set yet also be invented
  // as a mutation off another sequence. In that case, the sequence would
  // have a Source of IN_TEST for the Reason that put it in the
  // validation set as a known sequence and it would have another
  // Reason with a Source of INVENTED when it was created as a
  // mutation off a different sequence.
  // By definition, there should be no sequences that have a Reason
  // of IN_TEST and another Reason of IN_TRAINING.
  //
  // NOTE(review): the text above describes test/training folds and an
  // IN_TRAINING value, but the enum below declares SOURCE_IN_VALIDATION and
  // has no training value. It also refers to a "Reason" and to unprefixed
  // value names (INVENTED, IN_TEST); presumably these are older names for
  // this Choice message and the SOURCE_* values. Confirm which side is
  // correct before relying on either.
  enum Source {
    // The sequence was created as a mutation off a different sequence.
    // This is the zero value, so an unset `source` field also reads as
    // SOURCE_INVENTED.
    SOURCE_INVENTED = 0;

    // The sequence is in the test fold.
    SOURCE_IN_TEST = 1;

    // NOTE(review): not described by the comment on this enum, which talks
    // about training rather than validation. Confirm the intended meaning.
    SOURCE_IN_VALIDATION = 2;
  }
  Source source = 3;

  // Whether this sequence was picked as a control.
  // Positive controls are sequences known (published) to have high affinity.
  // Negative controls are sequences known to not bind (generally a polyT).
  enum ControlEnum {
    // The sequence was not picked as a control. This is the zero value, so
    // an unset `control` field also reads as NOT_CONTROL.
    NOT_CONTROL = 0;

    // A sequence known (published) to have high affinity.
    POSITIVE_CONTROL = 1;

    // A sequence known to not bind (generally a polyT).
    NEGATIVE_CONTROL = 2;
  }
  ControlEnum control = 4;

  // The name of the model used to select this aptamer. For aptamers
  // picked without using a specific model, this will be missing (which in
  // proto3 reads back as the empty string).
  string model_name = 5;

  // The score of this aptamer sequence in the model_name model. This
  // score is only relevant if a model_name is set. The score is only
  // meaningful within the context of the model_name and shouldn't
  // be compared to scores created by other models.
  // Because this is a plain proto3 float, an unset score reads back as 0
  // and cannot be told apart from a real score of 0; check model_name
  // before interpreting it.
  float model_score = 6;

  // Information about the mutation that made this sequence, only available
  // if the sequence is made by mutation.
  SeedMutation seed_mutation = 7;

  // Information about the global optimization step that picked this
  // sequence, only available if the sequence is made as part of the
  // global optimization.
  GlobalMutation global_mutation = 8;
}

// The methods used to generate a new sequence from an existing
// sequence.
enum MutationTypeEnum {
  // The method when mutations are made without any guiding model.
  // This is the zero value, so an unset field of this type also reads as
  // METHOD_RANDOM.
  METHOD_RANDOM = 0;

  // At each step, randomly generate many possible mutations from the best
  // sequences in the previous step, then score each and pick the best new
  // sequences.
  METHOD_SIMPLE_SAMPLING = 1;

  // Genetic algorithm for sampling: start with a population of sequences;
  // at each generation pick some of the population to reproduce (create
  // mutant sequences), then limit the population to sequences with the
  // highest fitness (regardless of which generation the sequence is from).
  // Unlike a standard genetic algorithm, a mutant child has only one parent.
  // TODO for the future: try to incorporate two parents (recombination),
  // in a way that doesn't disrupt secondary structure too much.
  METHOD_GENETIC_ALGORITHM = 2;
}

// Information about the mutation that made a sequence: how far it is from
// its seed sequence, which sequence it was mutated from, and the method
// used to pick the mutation.
message SeedMutation {
  // The number of steps away from the seed sequence. Seed sequences
  // will have a step = 0 and no previous_sequence.
  int64 step = 1;

  // The sequence that was mutated to make this sequence.
  string previous_sequence = 2;

  // The original seed sequence. Can be used as an ID to gather all the
  // sequences made from the same seed.
  string seed_sequence = 3;

  // How this sequence step was picked. An unset value reads as
  // METHOD_RANDOM, the zero value of MutationTypeEnum.
  MutationTypeEnum mutation_type = 4;
}

// Information about the global optimization step that picked a sequence.
//
// TODO(mdimon): determine what fields should go into this proto once we have a
// general global optimization scheme picked out.
message GlobalMutation {
  // The step number of the global optimization.
  int64 step = 1;
}
